package com.amos.isearch.common.parser;

import lombok.extern.slf4j.Slf4j;
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;

/**
 * DESCRIPTION: pdf parser
 *
 * @author <a href="mailto:daoyuan0626@gmail.com">amos.wang</a>
 * @date 2021/1/23
 */
@Slf4j
public class PdfParser {

    private static final Tika TIKA = new Tika();

    public static String getFileType(File file) throws IOException {
        return TIKA.detect(file);
    }

    public static String getContent(File file) {
        try {
            String type = getFileType(file);
            log.debug("文件类型 [{}]", type);

            return TIKA.parseToString(new FileInputStream(file));
        } catch (TikaException | IOException e) {
            e.printStackTrace();
        }

        return null;
    }

}
